In [1]:
import numpy as np
import pandas as pd
import six
import tensorflow as tf
import time
import os
from tqdm import tqdm

import random
import string

from keras.callbacks import EarlyStopping, ModelCheckpoint

%matplotlib inline

Using TensorFlow backend.


In [2]:
# Pickle file holding the corpus; the cells below unpack it as a
# (names, descriptions) pair via pd.read_pickle.
DATA_PATH = 'data/descriptions.pickle'

# Emit INFO-level TensorFlow log messages (training_generator logs the
# size and first characters of the corpus with tf.logging.info).
tf.logging.set_verbosity(tf.logging.INFO)

def transform(txt, pad_to=None):
    """Encode a string as an int32 array of character code points.

    Characters whose code point is 255 or above are dropped, so every
    returned value fits in the model's 256-entry vocabulary.

    Args:
        txt: The text to encode.
        pad_to: Optional fixed output length.  When given, the encoded text
            is truncated to its first ``pad_to`` values and then left-padded
            with zeros so that the result has exactly ``pad_to`` entries.

    Returns:
        A 1-D ``np.int32`` array of code points.
    """
    codes = np.asarray([ord(c) for c in txt if ord(c) < 255], dtype=np.int32)
    if pad_to is not None:
        codes = codes[:pad_to]
        # Pad by the length of the *encoded* array, not the raw text: the two
        # differ when characters were dropped above or the text was truncated
        # (the latter used to request a negative-sized zeros array).
        codes = np.concatenate([
            np.zeros([pad_to - len(codes)], dtype=np.int32),
            codes,
        ])
    return codes

def training_generator(seq_len=100, batch_size=1024):
    """Endlessly yield random (source, target) training batches.

    The names stored in the pickle at DATA_PATH are joined with newlines and
    encoded with ``transform``.  Every batch holds ``batch_size`` windows of
    ``seq_len`` characters taken at random offsets; the target of a window is
    the same window shifted one character to the right.

    Yields:
        A ``(source, target)`` tuple with shapes ``(batch_size, seq_len)`` and
        ``(batch_size, seq_len, 1)``.
    """
    names_raw, _descs_raw = pd.read_pickle(DATA_PATH)
    txt = '\n'.join(names_raw)
    tf.logging.info('Input text [%d] %s', len(txt), txt[:50])
    source = transform(txt)

    def windows(starts, shift):
        # One seq_len-long slice per start offset, stacked into a 2-D batch.
        return np.stack(
            [source[start + shift:start + shift + seq_len] for start in starts])

    while True:
        offsets = np.random.randint(0, len(source) - seq_len, batch_size)
        inputs = windows(offsets, 0)
        # The sparse cross-entropy loss expects labels with the same rank as
        # the model output, hence the extra trailing axis of size one.
        targets = windows(offsets, 1)[..., np.newaxis]
        yield inputs, targets

# Smoke test: pull a single tiny batch so the (source, target) arrays can be
# inspected by eye.
six.next(training_generator(seq_len=10, batch_size=1))

INFO:tensorflow:Input text [630341] Hall Napa Valley Cabernet Sauvignon 2013 
Rombauer


(array([[ 97, 114, 105, 110, 111,  32,  50,  48,  49,  55]]), array([[[114],
         [105],
         [110],
         [111],
         [ 32],
         [ 50],
         [ 48],
         [ 49],
         [ 55],
         [ 32]]]))

In [3]:
# Output width of the character embedding; lstm_model also uses this value as
# the number of units in each LSTM layer.
EMBEDDING_DIM = 512

def lstm_model(seq_len=100, batch_size=None, stateful=True):
    """Character-level language model: predict the next character.

    Architecture: embedding over 256 character codes, two stacked LSTM layers
    of EMBEDDING_DIM units that return full sequences, and a per-timestep
    softmax over the 256 possible next characters.

    Args:
        seq_len: Number of characters in each input sequence.
        batch_size: Fixed batch size for the input layer, or None.
        stateful: Passed to both LSTM layers.

    Returns:
        A compiled ``tf.keras.Model`` (RMSProp optimizer, sparse categorical
        cross-entropy loss, sparse categorical accuracy metric).
    """
    layers = tf.keras.layers

    source = tf.keras.Input(
        shape=(seq_len,), batch_size=batch_size, dtype=tf.int32, name='seed')

    hidden = layers.Embedding(input_dim=256, output_dim=EMBEDDING_DIM)(source)
    for _ in range(2):
        hidden = layers.LSTM(
            EMBEDDING_DIM, stateful=stateful, return_sequences=True)(hidden)
    predicted_char = layers.TimeDistributed(
        layers.Dense(256, activation='softmax'))(hidden)

    model = tf.keras.Model(inputs=[source], outputs=[predicted_char])
    model.compile(
        optimizer=tf.train.RMSPropOptimizer(learning_rate=0.01),
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy'])

    return model

In [None]:
# Reset the global Keras state before building a fresh training model.
tf.keras.backend.clear_session()

training_model = lstm_model(seq_len=100, batch_size=1024, stateful=False)
# Optional warm start from an earlier checkpoint:
#training_model.load_weights('model_small_chkpt.h5', by_name=True)

# The model is a tf.keras model, so the callbacks must be the tf.keras
# classes as well.  Callbacks from the standalone `keras` package are not
# guaranteed to implement the hooks tf.keras invokes (e.g.
# `on_train_batch_begin`) and can fail with an AttributeError.
#
# No validation data is supplied, so both callbacks monitor the *training*
# accuracy.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'model_small_v2_chkpt.h5',
    monitor='sparse_categorical_accuracy',
    verbose=1,
    save_best_only=True,
    mode='max')
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='sparse_categorical_accuracy',
    patience=3,
    mode='max')
callbacks_list = [checkpoint, early_stopping]

training_model.fit_generator(
    training_generator(seq_len=100, batch_size=1024),
    steps_per_epoch=100,
    epochs=5,
    callbacks=callbacks_list,
)

# NOTE(review): the prediction cells further down load 'model_weights.h5',
# not the file written here -- confirm which weights file is intended.
training_model.save_weights('model_small_v2_weights.h5', overwrite=True)

Epoch 1/5
INFO:tensorflow:Input text [4733481] Dark garnet in color, the 2013 HALL Napa Valley Ca

In [None]:
# Print the layer-by-layer summary of the model built in the training cell.
training_model.summary()

In [None]:
# NOTE(review): load_model is imported here but not called in any visible cell.
from keras.models import load_model

# Save the model to its own HDF5 file, separate from the weights-only file
# written at the end of the training cell.
training_model.save('model_and_weights.h5')

In [None]:
# Plot the training accuracy recorded for each epoch of the last fit.
import matplotlib.pyplot as plt

training_acc = training_model.history.history['sparse_categorical_accuracy']
epoch_index = range(len(training_acc))

fig, ax = plt.subplots(dpi=150)
ax.set_title("Accuracy per Epoch")
ax.plot(epoch_index, training_acc)
plt.show()

## 5 Epochs

In [4]:
BATCH_SIZE = 5
PREDICT_LEN = 250

# Keras requires the batch size be specified ahead of time for stateful models.
# We use a sequence length of 1, as we will be feeding in one character at a
# time and predicting the next character.
prediction_model = lstm_model(seq_len=1, batch_size=BATCH_SIZE, stateful=True)
prediction_model.load_weights('model_weights.h5')

# Seed the model with a random 20-character string (uppercase letters and
# digits), copied BATCH_SIZE times so that `seed` is (BATCH_SIZE, 20).
seed_txt = ''.join(random.choices(string.ascii_uppercase + string.digits, k=20))
seed = transform(seed_txt)
seed = np.repeat(np.expand_dims(seed, 0), BATCH_SIZE, axis=0)

# First, run all but the last seed character forward to prime the LSTM state.
prediction_model.reset_states()
for pos in range(seed.shape[1] - 1):
    prediction_model.predict(seed[:, pos:pos + 1])

# Now accumulate predictions.  Every entry of `predictions` is a 1-D array of
# BATCH_SIZE character codes; the first one is the last seed character.
predictions = [seed[:, -1]]
for _ in range(PREDICT_LEN):
    # The model input is (BATCH_SIZE, 1), so add the sequence axis explicitly.
    last_char = predictions[-1][:, np.newaxis]
    next_probs = prediction_model.predict(last_char)[:, 0, :]

    # Sample the next character of every batch row from its output distribution.
    next_idx = [
        np.random.choice(256, p=next_probs[row])
        for row in range(BATCH_SIZE)
    ]
    predictions.append(np.asarray(next_idx, dtype=np.int32))

for i in range(BATCH_SIZE):
    print('PREDICTION %d\n\n' % i)
    # As before, the text is the last seed character followed by the first
    # PREDICT_LEN - 1 sampled characters.
    generated = ''.join(chr(step[i]) for step in predictions[:PREDICT_LEN])
    print(generated)
    print()
    assert len(generated) == PREDICT_LEN, 'Generated text too short'

PREDICTION 0


HO Cabernet Franc 2016 
d'Arenberg The Copperst Regniere Vin Foo Notre (Futures Pre-Sale) 2017 
Duckhorn Napa Valley Merlot (1.5 Liter Magnum) 1949 
Amuse Bouche Cotes du Rhone St. Esprit Rose 2006 
Row Salentino Amabillo Alpasso Cuvee Uniello 0000 


PREDICTION 1


HOSham Rose 2016 
Ashtor Pinot Noir 2015 
Au Contray Moscato 2015 
MacRostie Sonoma Coast Pinot Noir 2017 
Domaine Emile Beyer Riesling Dry 2015 
Domaine de Grand Rose Barrel Select Sauvignon Blanc 2017 
7 Mt.L. Vineyard Hollow Creek Chardonnay 2016 

PREDICTION 2


HO Cabernet Sauvignon 2014 
Torres Verdeo Vigna Ciabot Camps 1 Charbonobrens Reserve Pinot Noir 2013 
Jean-Charles Fagot Les Rias Les Caillerets Premier Cru 2012 
Vina Just 2014 
Colgin Tychson Hill Vineyard Cabernet Sauvignon 2013 
Feudo di Santa Tr

PREDICTION 3


HOShat Faugeres 2014 
Chacewater Winery Cabernet Sauvignon 2015 
Four Graces Shiraz 2016 
Lucas &amp; Lewellen Zinfandel 2014 
Esk Valley Sauvignon Blanc 2015 
Northstar Columbia Valle

## 40 Epochs

In [5]:
BATCH_SIZE = 5
PREDICT_LEN = 150

# Keras requires the batch size be specified ahead of time for stateful models.
# We use a sequence length of 1, as we will be feeding in one character at a
# time and predicting the next character.
prediction_model = lstm_model(seq_len=1, batch_size=BATCH_SIZE, stateful=True)
prediction_model.load_weights('model_weights.h5')

# Generate 100 rounds of BATCH_SIZE texts and collect every complete line.
fake_names = []
for ii in tqdm(range(100)):
    # Seed the model with a random 20-character string (uppercase letters and
    # digits), copied BATCH_SIZE times so that `seed` is (BATCH_SIZE, 20).
    seed_txt = ''.join(random.choices(string.ascii_uppercase + string.digits, k=20))
    seed = transform(seed_txt)
    seed = np.repeat(np.expand_dims(seed, 0), BATCH_SIZE, axis=0)

    # First, run all but the last seed character forward to prime the state.
    prediction_model.reset_states()
    for pos in range(seed.shape[1] - 1):
        prediction_model.predict(seed[:, pos:pos + 1])

    # Every entry of `predictions` is a 1-D array of BATCH_SIZE character
    # codes; the first one is the last seed character.
    predictions = [seed[:, -1]]
    for _ in range(PREDICT_LEN):
        # The model input is (BATCH_SIZE, 1): add the sequence axis explicitly.
        last_char = predictions[-1][:, np.newaxis]
        next_probs = prediction_model.predict(last_char)[:, 0, :]

        # Sample the next character of every batch row.
        next_idx = [
            np.random.choice(256, p=next_probs[row])
            for row in range(BATCH_SIZE)
        ]
        predictions.append(np.asarray(next_idx, dtype=np.int32))

    for i in range(BATCH_SIZE):
        generated = ''.join(chr(step[i]) for step in predictions[:PREDICT_LEN])
        assert len(generated) == PREDICT_LEN, 'Generated text too short'
        # Drop the first and last fragments: the first continues the random
        # seed and the last is usually cut off mid-name.
        fake_names.extend(generated.split('\n')[1:-1])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:41<00:00,  2.36it/s]


In [None]:
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio of ``a`` and ``b``.

    The value is a float in [0, 1]; 1.0 means the sequences are identical.
    """
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()

# Only the names are needed here; the descriptions are unpacked and ignored.
real_names, descs_raw = pd.read_pickle(DATA_PATH)

# For every generated name, record its highest similarity to any real name.
# This is a full pairwise comparison and therefore slow on large inputs.
fake_scores = {}
for f_name in tqdm(fake_names):
    if f_name in fake_scores:
        # Duplicate generated name: already scored, skip the expensive scan.
        continue
    # Compute each similarity once; 0.0 is the score when there are no real names.
    fake_scores[f_name] = max(
        (similar(f_name, r_name) for r_name in real_names),
        default=0.0)

print('%d generated names, %d unique' % (len(fake_names), len(fake_scores)))
pd.Series(fake_scores).to_csv('data/fake_scores.csv')

 23%|███████████████████████████████████                                                                                                                         | 289/1284 [13:13<45:55,  2.77s/it]

## OLD

In [None]:
# Legacy cell: it loads weights from '/tmp/bard.h5', a file that no visible
# cell in this notebook writes.
BATCH_SIZE = 5
PREDICT_LEN = 250

# Keras requires the batch size be specified ahead of time for stateful models.
# We use a sequence length of 1, as we will be feeding in one character at a 
# time and predicting the next character.
prediction_model = lstm_model(seq_len=1, batch_size=BATCH_SIZE, stateful=True)
prediction_model.load_weights('/tmp/bard.h5')

# We seed the model with our initial string, copied BATCH_SIZE times

seed_txt = 'Looks it not like the king?  Verily, we must go! '
seed = transform(seed_txt)
seed = np.repeat(np.expand_dims(seed, 0), BATCH_SIZE, axis=0)

# First, run the seed forward to prime the state of the model.
# All seed characters except the last are fed here; the last one becomes the
# first input of the sampling loop below.
prediction_model.reset_states()
for i in range(len(seed_txt) - 1):
    prediction_model.predict(seed[:, i:i + 1])

# Now we can accumulate predictions!
# predictions[0] is the 2-D slice seed[:, -1:]; every later entry is a 1-D
# array of BATCH_SIZE sampled character codes.
predictions = [seed[:, -1:]]
for i in range(PREDICT_LEN):
    last_word = predictions[-1]
    next_probits = prediction_model.predict(last_word)[:, 0, :]
  
    # sample from our output distribution
    next_idx = [
        np.random.choice(256, p=next_probits[i])
        for i in range(BATCH_SIZE)
    ]
    predictions.append(np.asarray(next_idx, dtype=np.int32))
    
for i in range(BATCH_SIZE):
    print('PREDICTION %d\n\n' % i)
    # Only the first PREDICT_LEN of the PREDICT_LEN + 1 entries are used, so
    # the text starts with the last seed character and omits the final sample.
    p = [predictions[j][i] for j in range(PREDICT_LEN)]
    generated = ''.join([chr(c) for c in p])
    print(generated)
    print()
    assert len(generated) == PREDICT_LEN, 'Generated text too short'