In [5]:
import numpy as np
import pandas as pd
import six
import keras as k 
import time
import os
from tqdm import tqdm
import simplejson

import random
import string

from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model, model_from_json

print(k.backend.tensorflow_backend._get_available_gpus())
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

%matplotlib inline
print("Loaded.")

['/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1']
Loaded.


In [6]:
DATA_PATH = 'data/pickles/descriptions.pickle'

#tf.logging.set_verbosity(tf.logging.INFO)

def transform(txt, pad_to=None):
    # drop any non-ascii characters
    output = np.asarray([ord(c) for c in txt if ord(c) < 255], dtype=np.int32)
    if pad_to is not None:
        output = output[:pad_to]
        output = np.concatenate([
            np.zeros([pad_to - len(txt)], dtype=np.int32),
            output
        ])
    return output

def training_generator(seq_len=100, batch_size=1024):
    """A generator yields (source, target) arrays for training."""
    names_raw, descs_raw = pd.read_pickle(DATA_PATH)
    txt = '\n'.join(descs_raw)

    print('Input text [%d] %s', len(txt), txt[:50])
    source = transform(txt)
    while True:
        offsets = np.random.randint(0, len(source) - seq_len, batch_size)

        # Our model uses sparse crossentropy loss, but Keras requires labels
        # to have the same rank as the input logits.  We add an empty final
        # dimension to account for this.
        yield (
            np.stack([source[idx:idx + seq_len] for idx in offsets]),
            np.expand_dims(
                np.stack([source[idx + 1:idx + seq_len + 1] for idx in offsets]), 
                -1),
        )

six.next(training_generator(seq_len=10, batch_size=1))

Input text [%d] %s 4733481 Dark garnet in color, the 2013 HALL Napa Valley Ca


(array([[104,  98, 111, 114, 105, 110, 103,  32,  83,  97]]), array([[[ 98],
         [111],
         [114],
         [105],
         [110],
         [103],
         [ 32],
         [ 83],
         [ 97],
         [110]]]))

In [12]:
EMBEDDING_DIM = 512
from keras.layers import LSTM, Input, Embedding
from keras.models import Sequential
import keras as k
import tensorflow as tf

def lstm_model(seq_len=200, batch_size=None, stateful=True):
    """Language model: predict the next word given the current word."""
    model = k.models.Sequential()
    model.add(k.layers.Input(name='seed', shape=(seq_len,), batch_size=batch_size, dtype=tf.int32))
    model.add(k.layers.Embedding(input_dim=256, output_dim=EMBEDDING_DIM))
    model.add(k.layers.LSTM(EMBEDDING_DIM, stateful=stateful, return_sequences=True))
    model.add(k.layers.LSTM(EMBEDDING_DIM, stateful=stateful, return_sequences=True))
    #drop_1 = tf.keras.layers.Dropout(0.2)
    predicted_char = k.layers.TimeDistributed(k.layers.Dense(256, activation='softmax'))(lstm_2)
    model = tf.keras.Model(inputs=[source], outputs=[predicted_char])
    model.compile(
        optimizer=tf.train.RMSPropOptimizer(learning_rate=0.01),
        #optimizer=tf.keras.optimizers.RMSprop(lr=0.01),
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy'])
    
    return model

def lstm_model(seq_len=200, batch_size=None, stateful=True):
    print(batch_size)
    print(seq_len)
    """Language model: predict the next word given the current word."""
    source = k.Input(name='seed', batch_shape=(batch_size, seq_len), dtype=tf.int32)
    embedding = k.layers.Embedding(input_dim=256, output_dim=EMBEDDING_DIM)(source)
    lstm_1 = k.layers.LSTM(EMBEDDING_DIM, stateful=stateful, return_sequences=True)(embedding)
    lstm_2 = k.layers.LSTM(EMBEDDING_DIM, stateful=stateful, return_sequences=True)(lstm_1)
    #drop_1 = tf.keras.layers.Dropout(0.2)
    predicted_char = k.layers.TimeDistributed(k.layers.Dense(256, activation='softmax'))(lstm_2)
    model = k.Model(inputs=[source], outputs=[predicted_char])
    return model

tf.keras.backend.clear_session()

training_model = lstm_model(seq_len=200, batch_size=1024, stateful=False)
#with open('data/models_weights/model_char_DESCS.json', 'r') as temp:
#    training_model = model_from_json(temp.read())
#temp.read()
#temp.close()
#training_model = model_from_json(training_model)
training_model.compile(optimizer=k.optimizers.RMSprop(lr=0.01),
                       loss='sparse_categorical_crossentropy',
                       metrics=['sparse_categorical_accuracy'])
#training_model.load_weights('data/models_weights/model_char_DESCS_weights.h5')

1024
200


In [13]:
# serialize model to JSON
#model_json = training_model.to_json()
#with open("data/models_weights/model_char_DESCS.json", "w") as json_file:
#    json_file.write(simplejson.dumps(simplejson.loads(model_json), indent=4))

# TRAIN
checkpoint = ModelCheckpoint('model_char_DESCS_chkpt_v2.h5', 
                             monitor='sparse_categorical_accuracy', 
                             save_best_only=True, 
                             mode='max')
early_stopping = EarlyStopping(monitor='sparse_categorical_accuracy',
                               patience=3,
                               mode='max')
callbacks_list = [checkpoint,early_stopping]
training_model.fit_generator(
    training_generator(seq_len=200, batch_size=1024),
    steps_per_epoch=200,
    epochs=2,
    callbacks = callbacks_list
    )
#training_model.save_weights('model_char_DESCS_weights_v2.h5', overwrite=True)

Epoch 1/2
Input text [%d] %s 4733481 Dark garnet in color, the 2013 HALL Napa Valley Ca


InvalidArgumentError: Tensor seed:0, specified in either feed_devices or fetch_devices was not found in the Graph

In [9]:
training_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
seed (InputLayer)            (1024, 200)               0         
_________________________________________________________________
embedding (Embedding)        (1024, 200, 512)          131072    
_________________________________________________________________
lstm (LSTM)                  (1024, 200, 512)          2099200   
_________________________________________________________________
lstm_1 (LSTM)                (1024, 200, 512)          2099200   
_________________________________________________________________
time_distributed (TimeDistri (1024, 200, 256)          131328    
Total params: 4,460,800
Trainable params: 4,460,800
Non-trainable params: 0
_________________________________________________________________


In [16]:
# list all data in history
import matplotlib.pyplot as plt

training_acc = training_model.history.history['sparse_categorical_accuracy']
plt.figure(dpi=150)
plt.title("Accuracy per Epoch")
plt.plot(range(len(training_acc)),  training_acc)
plt.show()

AttributeError: 'Model' object has no attribute 'history'

In [14]:

# serialize model to JSON
model_json = training_model.to_json()
with open("data/models_weights/model_char_DESCS.json", "w") as json_file:
    json_file.write(simplejson.dumps(simplejson.loads(model_json), indent=4))

## 5 Epochs

In [10]:
BATCH_SIZE = 5
PREDICT_LEN = 250

# Keras requires the batch size be specified ahead of time for stateful models.
# We use a sequence length of 1, as we will be feeding in one character at a 
# time and predicting the next character.
prediction_model = lstm_model(seq_len=1, batch_size=BATCH_SIZE, stateful=True)
prediction_model.load_weights('data/models_weights/model_char_DESCS_weights.h5')

# We seed the model with our initial string, copied BATCH_SIZE times

seed_txt = 'This wine tastes like '
seed_txt = ''.join(random.choices(string.ascii_uppercase + string.digits, k=20))
seed = transform(seed_txt)
seed = np.repeat(np.expand_dims(seed, 0), BATCH_SIZE, axis=0)

# First, run the seed forward to prime the state of the model.
prediction_model.reset_states()
for i in range(len(seed_txt) - 1):
    prediction_model.predict(seed[:, i:i + 1])

# Now we can accumulate predictions!
predictions = [seed[:, -1:]]
for i in range(PREDICT_LEN):
    last_word = predictions[-1]
    next_probits = prediction_model.predict(last_word)[:, 0, :]
  
  # sample from our output distribution
    next_idx = [
        np.random.choice(256, p=next_probits[i])
        for i in range(BATCH_SIZE)
    ]
    predictions.append(np.asarray(next_idx, dtype=np.int32))
    
for i in range(BATCH_SIZE):
    print('PREDICTION %d\n\n' % i)
    p = [predictions[j][i] for j in range(PREDICT_LEN)]
    generated = ''.join([chr(c) for c in p])
    print(generated)
    print()
    assert len(generated) == PREDICT_LEN, 'Generated text too short'

PREDICTION 0


V Pinots in Napa Valley, which sees an innovative and a Pinot Noir the Catena of the Jurava and in 1992, and has been the new winery is one of the aims at the establish foeser vesselton Cellars produces worldwide in Mendoza.
A time to create Alexande

PREDICTION 1


V Tempranillo store a sense of place the first impresses on approachable as the final blend, dry-farmed autown winemakers also unclos daughter and the same vines by a chewy texture while exotic fruit and out finish that end to the sunny family estate

PREDICTION 2


V Pinot Noir grapes, this Chateauneuf-du-Pape is one of Grenache and a continue the distinct cellar to Frank Fador and a classic Paso Robles was founded in Sonoma County's Knid and Shiraz. Born in the worlds, used a four most renowned hot, the around

PREDICTION 3


V Pinot Noir shows flavor with a wide with a dusty structure and elegance.
Popilia Cook Cellar Carignan has a breaks the estate wines, dedicated near the southern box of vineyard langu

## 40 Epochs

In [None]:
BATCH_SIZE = 5
PREDICT_LEN = 150

# Keras requires the batch size be specified ahead of time for stateful models.
# We use a sequence length of 1, as we will be feeding in one character at a 
# time and predicting the next character.
prediction_model = lstm_model(seq_len=1, batch_size=BATCH_SIZE, stateful=True)
prediction_model.load_weights('model_weights.h5')

fake_names = []
for ii in tqdm(range(100)):
    # We seed the model with our initial string, copied BATCH_SIZE times
    seed_txt = 'This wine tastes like '
    seed_txt = ''.join(random.choices(string.ascii_uppercase + string.digits, k=20))
    seed = transform(seed_txt)
    seed = np.repeat(np.expand_dims(seed, 0), BATCH_SIZE, axis=0)

    # First, run the seed forward to prime the state of the model.
    prediction_model.reset_states()
    for i in range(len(seed_txt) - 1):
        prediction_model.predict(seed[:, i:i + 1])

    # Now we can accumulate predictions!
    predictions = [seed[:, -1:]]
    for i in range(PREDICT_LEN):
        last_word = predictions[-1]
        next_probits = prediction_model.predict(last_word)[:, 0, :]

      # sample from our output distribution
        next_idx = [
            np.random.choice(256, p=next_probits[i])
            for i in range(BATCH_SIZE)
        ]
        predictions.append(np.asarray(next_idx, dtype=np.int32))

    for i in range(BATCH_SIZE):
        #print('PREDICTION %d\n\n' % i)
        p = [predictions[j][i] for j in range(PREDICT_LEN)]
        generated = ''.join([chr(c) for c in p])
        gen_list = generated.split('\n')[1:-1]
        for item in gen_list:
            fake_names.append(item)
        assert len(generated) == PREDICT_LEN, 'Generated text too short'

In [None]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()



In [None]:
real_names, descs_raw = pd.read_pickle(DATA_PATH)

fake_names_2 = []
for f_name in fake_names:
    max_score = 0.0
    for r_name in real_names:
        if similar(f_name,r_name) > max_score
    
len(fake_names_2)

## OLD

In [None]:
BATCH_SIZE = 5
PREDICT_LEN = 250

# Keras requires the batch size be specified ahead of time for stateful models.
# We use a sequence length of 1, as we will be feeding in one character at a 
# time and predicting the next character.
prediction_model = lstm_model(seq_len=1, batch_size=BATCH_SIZE, stateful=True)
prediction_model.load_weights('/tmp/bard.h5')

# We seed the model with our initial string, copied BATCH_SIZE times

seed_txt = 'Looks it not like the king?  Verily, we must go! '
seed = transform(seed_txt)
seed = np.repeat(np.expand_dims(seed, 0), BATCH_SIZE, axis=0)

# First, run the seed forward to prime the state of the model.
prediction_model.reset_states()
for i in range(len(seed_txt) - 1):
    prediction_model.predict(seed[:, i:i + 1])

# Now we can accumulate predictions!
predictions = [seed[:, -1:]]
for i in range(PREDICT_LEN):
    last_word = predictions[-1]
    next_probits = prediction_model.predict(last_word)[:, 0, :]
  
  # sample from our output distribution
    next_idx = [
        np.random.choice(256, p=next_probits[i])
        for i in range(BATCH_SIZE)
    ]
    predictions.append(np.asarray(next_idx, dtype=np.int32))
    
for i in range(BATCH_SIZE):
    print('PREDICTION %d\n\n' % i)
    p = [predictions[j][i] for j in range(PREDICT_LEN)]
    generated = ''.join([chr(c) for c in p])
    print(generated)
    print()
    assert len(generated) == PREDICT_LEN, 'Generated text too short'