In [2]:
import numpy as np
import pandas as pd
import six
import tensorflow as tf
import time
import os
from tqdm import tqdm

import random
import string

import gpt_2_simple as gpt2

from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
# Emit INFO-level TensorFlow log messages (this is the TF 1.x `tf.logging` API).
tf.logging.set_verbosity(tf.logging.INFO)

# Restrict CUDA to the device with index 0 for this kernel.
os.environ["CUDA_VISIBLE_DEVICES"]="0";  

%matplotlib inline

# Width used for both the Embedding output and the LSTM layers in lstm_model().
EMBEDDING_DIM = 512
# Pickle that is unpacked as a (names, descriptions) pair where it is loaded.
SCRAPED_DATA_PATH = 'data/scraped/descriptions.pickle'
# NOTE(review): not referenced by any visible cell — confirm it is still needed.
FAKE_NAMES_PATH = 'data/fake/fake_names_12949.pickle'
# Weights file written after training and loaded again by the generation cells.
MODEL_WEIGHTS_PATH = 'data/models_weights/model_description_weights.h5'


Using TensorFlow backend.


In [8]:
# Load the scraped (names, descriptions) pair and export the descriptions as a
# plain-text corpus for GPT-2 fine-tuning.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
names_raw, descs_raw = pd.read_pickle(SCRAPED_DATA_PATH)

# Write raw text, one description per line. Series.to_csv() would also emit the
# row index (plus a header on newer pandas) and CSV quoting, all of which a
# raw-text consumer would treat as part of the training corpus.
# The file is UTF-8: whatever reads it must decode it as UTF-8 as well, a
# platform-default 'charmap' codec cannot decode every byte it contains.
with open("data/scraped/descriptions.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.write("\n".join(descs_raw))

  


In [9]:
#gpt2.download_gpt2()   # model is saved into current directory under /models/117M/

# Open a TensorFlow session through gpt_2_simple and fine-tune on the exported
# descriptions text file.
# NOTE(review): the recorded run of this cell stopped with a UnicodeDecodeError
# ('charmap' codec) while loading the dataset, i.e. the text file was decoded
# with a platform-default codec that cannot read it — confirm how the installed
# gpt_2_simple version opens the file before re-running.
sess = gpt2.start_tf_sess()
gpt2.finetune(sess, 'data/scraped/descriptions.txt', steps=1000)   # steps is max number of training steps

#gpt2.generate(sess)

Loading checkpoint models\117M\model.ckpt
INFO:tensorflow:Restoring parameters from models\117M\model.ckpt
Loading dataset...


  0%|                                                                                                                       | 0/1 [00:00<?, ?it/s]


UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 7972: character maps to <undefined>

In [7]:
# Convert text to arrays of letters represented as integers
def transform(txt, pad_to=None):
    """Encode text as a 1-D int32 array of character code points.

    Characters whose code point is 255 or higher are dropped, so every value
    in the result lies in the range 0..254.

    Args:
        txt: String to encode.
        pad_to: Optional fixed output length. An encoding longer than
            `pad_to` is truncated to its first `pad_to` values; a shorter one
            is left-padded with zeros.

    Returns:
        np.ndarray of dtype int32. Its length is exactly `pad_to` when
        `pad_to` is given, otherwise the number of characters kept.
    """
    output = np.asarray([ord(c) for c in txt if ord(c) < 255], dtype=np.int32)
    if pad_to is not None:
        output = output[:pad_to]
        # Size the padding from the encoded (filtered, truncated) array rather
        # than len(txt): len(txt) is too large when characters were dropped
        # and yields a negative size when txt is longer than pad_to.
        padding = np.zeros([pad_to - len(output)], dtype=np.int32)
        output = np.concatenate([padding, output])
    return output

# How the characters will be fed into the model
def training_generator(seq_len=100, batch_size=1024):
    """Yield an endless stream of random (input, target) training batches.

    The scraped descriptions are joined with newlines and encoded with
    transform(). For every batch, `batch_size` random start offsets are drawn;
    the input rows are the `seq_len` values starting at each offset, and the
    target rows are the same windows shifted one position ahead, with a
    trailing axis of size 1 appended.
    """
    _, descriptions = pd.read_pickle(SCRAPED_DATA_PATH)
    corpus = '\n'.join(descriptions)
    tf.logging.info('Input text [%d] %s', len(corpus), corpus[:50])
    encoded = transform(corpus)
    # Exclusive upper bound for start offsets, so the shifted target window
    # never runs past the end of the encoded corpus.
    offset_limit = len(encoded) - seq_len
    while True:
        starts = np.random.randint(0, offset_limit, batch_size)
        inputs = np.stack([encoded[s:s + seq_len] for s in starts])
        targets = np.stack([encoded[s + 1:s + seq_len + 1] for s in starts])
        yield inputs, np.expand_dims(targets, -1)

In [8]:
def lstm_model(seq_len=100, batch_size=None, stateful=True):
    """Build and compile the character-level LSTM language model.

    The network takes an int32 input named 'seed' of shape (batch, seq_len),
    embeds it, runs it through two sequence-returning LSTM layers and applies
    a softmax over 256 classes at every timestep.

    Args:
        seq_len: Number of timesteps in each input sequence.
        batch_size: Fixed batch size of the input layer, or None.
        stateful: Passed to both LSTM layers as their `stateful` flag.

    Returns:
        A tf.keras.Model compiled with RMSProp (learning rate 0.01), sparse
        categorical cross-entropy loss and sparse categorical accuracy.
    """
    layers = tf.keras.layers
    vocab_size = 256

    seed_input = tf.keras.Input(
        name='seed', shape=(seq_len,), batch_size=batch_size, dtype=tf.int32)

    hidden = layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM)(seed_input)
    # Two stacked LSTM layers, each emitting its full output sequence.
    for _ in range(2):
        hidden = layers.LSTM(EMBEDDING_DIM, stateful=stateful, return_sequences=True)(hidden)

    softmax_head = layers.Dense(vocab_size, activation='softmax')
    char_probs = layers.TimeDistributed(softmax_head)(hidden)

    model = tf.keras.Model(inputs=[seed_input], outputs=[char_probs])
    model.compile(
        optimizer=tf.train.RMSPropOptimizer(learning_rate=0.01),
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy'])
    return model

# Reset Keras' global state before building the training graph.
tf.keras.backend.clear_session()
# Training uses a non-stateful model with a fixed batch of 1024 sequences of
# 100 characters, matching the generator arguments below.
training_model = lstm_model(seq_len=100, batch_size=1024, stateful=False)
#training_model.load_weights('model_small_chkpt.h5', by_name=True)

# Save a checkpoint whenever the training accuracy reaches a new maximum.
# NOTE(review): these callback classes are imported from standalone `keras`
# while the model is built with `tf.keras` — confirm the two are compatible in
# the installed versions.
checkpoint = ModelCheckpoint('data/models_weights/model_char_DESCS_chkpt.h5', 
                             monitor='sparse_categorical_accuracy', 
                             verbose=1, 
                             save_best_only=True, 
                             mode='max')
# Stop once the training accuracy has not improved for 5 consecutive epochs.
early_stopping = EarlyStopping(monitor='sparse_categorical_accuracy',
                               patience=5,
                               mode='max')
callbacks_list = [checkpoint,early_stopping]



# Train for at most 50 epochs of 100 generator batches each.
training_model.fit_generator(
    training_generator(seq_len=100, batch_size=1024),
    steps_per_epoch=100,
    epochs=50,
    callbacks = callbacks_list
    )

# Persist the weights as they are when training ends; the generation cells
# load this file. This is a separate file from the best-accuracy checkpoint
# written by the ModelCheckpoint callback above.
training_model.save_weights(MODEL_WEIGHTS_PATH, overwrite=True)

Epoch 1/50
INFO:tensorflow:Input text [4733481] Dark garnet in color, the 2013 HALL Napa Valley Ca
Epoch 00001: sparse_categorical_accuracy improved from -inf to 0.13997, saving model to data/models_weights/model_char_DESCS_chkpt.h5
Epoch 2/50
Epoch 00002: sparse_categorical_accuracy improved from 0.13997 to 0.15756, saving model to data/models_weights/model_char_DESCS_chkpt.h5
Epoch 3/50
Epoch 00003: sparse_categorical_accuracy improved from 0.15756 to 0.45237, saving model to data/models_weights/model_char_DESCS_chkpt.h5
Epoch 4/50
Epoch 00004: sparse_categorical_accuracy improved from 0.45237 to 0.65604, saving model to data/models_weights/model_char_DESCS_chkpt.h5
Epoch 5/50
Epoch 00005: sparse_categorical_accuracy improved from 0.65604 to 0.70876, saving model to data/models_weights/model_char_DESCS_chkpt.h5
Epoch 6/50
Epoch 00006: sparse_categorical_accuracy improved from 0.70876 to 0.73620, saving model to data/models_weights/model_char_DESCS_chkpt.h5
Epoch 7/50
Epoch 00007: spa

In [9]:
# Print the layer-by-layer summary (output shapes, parameter counts) of the training model.
training_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
seed (InputLayer)            (1024, 100)               0         
_________________________________________________________________
embedding (Embedding)        (1024, 100, 512)          131072    
_________________________________________________________________
lstm (LSTM)                  (1024, 100, 512)          2099200   
_________________________________________________________________
lstm_1 (LSTM)                (1024, 100, 512)          2099200   
_________________________________________________________________
time_distributed (TimeDistri (1024, 100, 256)          131328    
Total params: 4,460,800
Trainable params: 4,460,800
Non-trainable params: 0
_________________________________________________________________


# Show a sample of generated wine descriptions

In [10]:
BATCH_SIZE = 5
PREDICT_LEN = 350
# EMBEDDING_DIM and MODEL_WEIGHTS_PATH are taken from the configuration cell.

# Keras requires the batch size be specified ahead of time for stateful models.
# We use a sequence length of 1, as we will be feeding in one character at a
# time and predicting the next character.
prediction_model = lstm_model(seq_len=1, batch_size=BATCH_SIZE, stateful=True)
prediction_model.load_weights(MODEL_WEIGHTS_PATH)

# We seed the model with a random 20-character string of uppercase letters and
# digits, copied BATCH_SIZE times. Assign a fixed prompt such as
# 'This wine tastes like ' to seed_txt instead to steer the samples.
seed_txt = ''.join(random.choices(string.ascii_uppercase + string.digits, k=20))
seed = transform(seed_txt)
seed = np.repeat(np.expand_dims(seed, 0), BATCH_SIZE, axis=0)

# First, run the seed forward to prime the state of the model. Iterate over
# the encoded seed, not seed_txt: transform() may drop characters, and slicing
# past the encoded length would feed empty inputs to the model.
prediction_model.reset_states()
for i in range(seed.shape[1] - 1):
    prediction_model.predict(seed[:, i:i + 1])

# Now we can accumulate predictions! Every entry is a (BATCH_SIZE,) int array;
# the first entry is the last seed character of each row.
predictions = [seed[:, -1]]
for i in range(PREDICT_LEN):
    # The model input is (BATCH_SIZE, 1): one character per row.
    last_word = predictions[-1].reshape(-1, 1)
    next_probits = prediction_model.predict(last_word)[:, 0, :]

    # sample from our output distribution
    next_idx = [
        np.random.choice(256, p=next_probits[row])
        for row in range(BATCH_SIZE)
    ]
    predictions.append(np.asarray(next_idx, dtype=np.int32))

for i in range(BATCH_SIZE):
    print('PREDICTION %d\n\n' % i)
    # The text is the last seed character followed by the sampled characters,
    # PREDICT_LEN characters in total. int() turns each numpy scalar into a
    # plain Python int before chr().
    generated = ''.join(chr(int(step[i])) for step in predictions[:PREDICT_LEN])
    print(generated)
    print()
    assert len(generated) == PREDICT_LEN, 'Generated text too short'

PREDICTION 0


BFres Pea or California.
Who Franci Megrey aimon facility that's from which Mrisas prosish, this wine is concentrated sharp and sweetness.
Deep and subtle mix of figs and fresh pith. A spicy peach with spicy peach, pineapple and passion fruit. The palate is rich and aromatized and texture. The nose presents aromas of red cherry and plums or finesse

PREDICTION 1


BA Right Crosselt's wine enthusiasts in Magdaleto, original premier cru spent to grow, Chardonnay and Zinfandel in Sonomas Russian River Valley and to retain from the concrete. it is a rare of steely climate fully better sits gentle date between lofts to drink.
Since 1880, Maison Joseph Drouhin heritang year hold yees long toward a number of vineya

PREDICTION 2


B'l Latour years ago, it would be a great place to calca however the flavor of the Herriard one of the most varieties, including Marsanne's displays more vanilla bean and citrus notes, hints of fresh spice and roasted hazelnut. 


Le Secret is the ide

# Generate a larger list of fake wine descriptions

In [11]:
BATCH_SIZE = 1
PREDICT_LEN = 600
# EMBEDDING_DIM and MODEL_WEIGHTS_PATH are taken from the configuration cell.


# Keras requires the batch size be specified ahead of time for stateful models.
# We use a sequence length of 1, as we will be feeding in one character at a
# time and predicting the next character.
prediction_model = lstm_model(seq_len=1, batch_size=BATCH_SIZE, stateful=True)
prediction_model.load_weights(MODEL_WEIGHTS_PATH)
predicted_names = pd.read_csv('data/fake/NAMES_v1.csv')

# One generation pass per name in the CSV.
N_PREDICTIONS = len(predicted_names)

fake_NAME = []
fake_DESC = []
for ii in tqdm(range(N_PREDICTIONS)):
    # We seed the model with the ii-th name, copied BATCH_SIZE times; with
    # BATCH_SIZE > 1 this yields several descriptions for the same name.
    # Positional .iloc access keeps the lookup independent of the frame's index.
    seed_txt = predicted_names['name'].iloc[ii]
    seed = transform(seed_txt)
    seed = np.repeat(np.expand_dims(seed, 0), BATCH_SIZE, axis=0)

    # First, run the seed forward to prime the state of the model. Iterate
    # over the encoded seed, not seed_txt: transform() may drop characters,
    # and slicing past the encoded length would feed empty inputs to the model.
    prediction_model.reset_states()
    for i in range(seed.shape[1] - 1):
        prediction_model.predict(seed[:, i:i + 1])

    # Now we can accumulate predictions! Every entry is a (BATCH_SIZE,) int
    # array; the first entry is the last seed character of each row.
    predictions = [seed[:, -1]]
    for i in range(PREDICT_LEN):
        # The model input is (BATCH_SIZE, 1): one character per row.
        last_word = predictions[-1].reshape(-1, 1)
        next_probits = prediction_model.predict(last_word)[:, 0, :]

        # sample from our output distribution
        next_idx = [
            np.random.choice(256, p=next_probits[row])
            for row in range(BATCH_SIZE)
        ]
        predictions.append(np.asarray(next_idx, dtype=np.int32))

    for i in range(BATCH_SIZE):
        # int() turns each numpy scalar into a plain Python int before chr().
        generated = ''.join(chr(int(step[i])) for step in predictions[:PREDICT_LEN])
        # Drop the text before the first period (presumably the model's
        # continuation of the seeded name) and the unfinished text after the
        # last period. Re-join with '.' so the kept sentences retain their
        # terminating periods.
        gen_list = generated.split('.')[1:-1]
        gen_conc = '.'.join(gen_list).strip() + '.'
        fake_NAME.append(seed_txt)
        fake_DESC.append(gen_conc)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1275/1275 [35:03<00:00,  1.63s/it]


In [19]:
# Pair every seeded name with its generated description and write the table
# to an Excel workbook (no index column) using the xlsxwriter engine.
names_descriptions = pd.DataFrame({'name': fake_NAME, 'description': fake_DESC})
names_descriptions.to_excel(
    'data/fake/names_descriptions.xlsx',
    index=False,
    engine='xlsxwriter',
)