In [1]:
import logging
import numpy as np
import pandas as pd
import six
import tensorflow as tf
import time
import os
import datetime as dt

from tqdm import tqdm
from tensorflow import keras

import random
import string

#from keras.callbacks import EarlyStopping, ModelCheckpoint
#from keras.models import load_model

MODEL_WEIGHTS_PATH = 'data/models_weights/name_model_weights.h5'

# Set log handler
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("modelNames-logger")
log.setLevel(logging.INFO)

# TensorFlow
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
#!pip install tensorflow-gpu==1.13.1

2.3.1
Num GPUs Available:  0


In [2]:
tf.test.is_built_with_cuda()

True

In [3]:
tf.test.is_built_with_gpu_support()

True

In [22]:
def transform(txt, pad_to=None):
    # drop any non-ascii characters
    output = np.asarray([ord(c) for c in txt if ord(c) < 255], dtype=np.int32)
    if pad_to is not None:
        output = output[:pad_to]
        output = np.concatenate([
            np.zeros([pad_to - len(txt)], dtype=np.int32),
            output
        ])
    return output

def training_generator(dataset, seq_len=100, batch_size=1024):
    """A generator yields (source, target) arrays for training."""
    wine_data = pd.read_csv(dataset, sep='\t', header=None)
    wine_data = wine_data[2] # Take just the names for modeling
    txt = '\n'.join(wine_data)

    #tf.logging.info('Input text [%d] %s', len(txt), txt[:50])
    source = transform(txt)
    while True:
        offsets = np.random.randint(0, len(source) - seq_len, batch_size)

        # Our model uses sparse crossentropy loss, but Keras requires labels
        # to have the same rank as the input logits.  We add an empty final
        # dimension to account for this.
        yield (
            np.stack([source[idx:idx + seq_len] for idx in offsets]),
            np.expand_dims(
                np.stack([source[idx + 1:idx + seq_len + 1] for idx in offsets]), 
                -1),
        )

#six.next(training_generator(seq_len=10, batch_size=1))
log.info("Loaded helper functions")

INFO:modelNames-logger:Loaded helper functions


# Define model and train it on dataset

In [23]:
#import os
#os.environ["CUDA_VISIBLE_DEVICES"]="1"

EMBEDDING_DIM = 128
EPOCHS = 25

def lstm_model(seq_len=100, batch_size=None, stateful=True):
    """Language model: predict the next word given the current word."""
    source = tf.keras.Input(
        name='seed', shape=(seq_len,), batch_size=batch_size, dtype=tf.int32)

    embedding = tf.keras.layers.Embedding(input_dim=256, output_dim=EMBEDDING_DIM)(source)
    lstm_1 = tf.keras.layers.LSTM(EMBEDDING_DIM, stateful=stateful, return_sequences=True)(embedding)
    lstm_2 = tf.keras.layers.LSTM(EMBEDDING_DIM, stateful=stateful, return_sequences=True)(lstm_1)
    #drop_1 = tf.keras.layers.Dropout(0.2)
    predicted_char = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(256, activation='softmax'))(lstm_2)
    model = tf.keras.Model(inputs=[source], outputs=[predicted_char])
    model.compile(
        optimizer='rmsprop',
        #optimizer=tf.keras.optimizers.RMSprop(lr=0.01),
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy'])
    
    return model

tf.keras.backend.clear_session()
if 'session' in locals() and session is not None:
    print('Close interactive session')
    session.close()

training_model = lstm_model(seq_len=100, batch_size=1024, stateful=False)
#training_model.load_weights('model_small_chkpt.h5', by_name=True)

checkpoint = keras.callbacks.ModelCheckpoint('model_names_chkpt.h5', 
                             monitor='sparse_categorical_accuracy', 
                             verbose=1, 
                             save_best_only=True, 
                             mode='max')
early_stopping = keras.callbacks.EarlyStopping(monitor='sparse_categorical_accuracy',
                               patience=3,
                               mode='max')
callbacks_list = [checkpoint,early_stopping]
print(training_model.summary())


SCRAPED_WINES_INPUT_PATH = 'data/scraped/name_desc_nlp_ready.txt'

training_model.fit_generator(
    training_generator(SCRAPED_WINES_INPUT_PATH, seq_len=100, batch_size=1024),
    steps_per_epoch=100,
    epochs=EPOCHS,
    callbacks = callbacks_list,
    verbose = 1
    )

training_model.save_weights(MODEL_WEIGHTS_PATH, overwrite=True)

# 16 = 3.5 loss (2 epoch)

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
seed (InputLayer)            [(1024, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (1024, 100, 128)          32768     
_________________________________________________________________
lstm (LSTM)                  (1024, 100, 128)          131584    
_________________________________________________________________
lstm_1 (LSTM)                (1024, 100, 128)          131584    
_________________________________________________________________
time_distributed (TimeDistri (1024, 100, 256)          33024     
Total params: 328,960
Trainable params: 328,960
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/25
  7/100 [=>............................] - ETA: 1:54 - loss: 4.7601 - sparse_categorical_accuracy:

KeyboardInterrupt: 

# Show sample of created wine names

In [11]:
BATCH_SIZE = 5
PREDICT_LEN = 250

# Keras requires the batch size be specified ahead of time for stateful models.
# We use a sequence length of 1, as we will be feeding in one character at a 
# time and predicting the next character.
prediction_model = lstm_model(seq_len=1, batch_size=BATCH_SIZE, stateful=True)
prediction_model.load_weights(MODEL_WEIGHTS_PATH)

seed_txt = ''.join(random.choices(string.ascii_uppercase + string.digits, k=20))
seed = transform(seed_txt)
seed = np.repeat(np.expand_dims(seed, 0), BATCH_SIZE, axis=0)

# First, run the seed forward to prime the state of the model.
prediction_model.reset_states()
for i in range(len(seed_txt) - 1):
    prediction_model.predict(seed[:, i:i + 1])

# Now we can accumulate predictions!
predictions = [seed[:, -1:]]
for i in range(PREDICT_LEN):
    last_word = predictions[-1]
    next_probits = prediction_model.predict(last_word)[:, 0, :]
  
  # sample from our output distribution
    next_idx = [
        np.random.choice(256, p=next_probits[i])
        for i in range(BATCH_SIZE)
    ]
    predictions.append(np.asarray(next_idx, dtype=np.int32))
    
for i in range(BATCH_SIZE):
    print('PREDICTION %d\n\n' % i)
    p = [predictions[j][i] for j in range(PREDICT_LEN)]
    generated = ''.join([chr(c) for c in p])
    print(generated)
    print()
    assert len(generated) == PREDICT_LEN, 'Generated text too short'

PREDICTION 0


7ron Sauvignon Blanc 2014
CoxchaKniw Oilelta Drapoyo 2017
Vina Chert Henrled Vintner Pardr Pinot Noir 2001
Harmwake Phite Chardonnay 2012
Alte Dare Ca Rito Viogu 2017
Blakig Mounton Vina Grise (Futures du Grand Estate 2014
Jamone Maruso Villagi d'Arb

PREDICTION 1


7is Carrair Cabernet Sauvignon 2014
Canter Cabernet Sauvignon 2012
Pewis Gramvan Solan Zincs Malbec 1994
Tlat Mayon Cabern Chardonnay 2013
M. Chapa Pinot Noir 2008
Montarra Farls Cote Treviers Stermiin Vineyard Pinot Noir 2015
Tormor Mise Sauvignon B

PREDICTION 2


7rifors)
ReOd Estre Red d'Or Moutir de Vauventer Vineyard Ext Belle Vineyard Pinot Noir 2014
Terred Moncanay 2004
Nichiol Brheuth Cabernet Sauvignon 1998
Barnasco Muciner Bruz Zinfandel 2000
Holby Alepusco de Hervay Cabernet Sauvignon 1998
Quarta Ric

PREDICTION 3


7ilt Zinfandel 2001
Klain Cabernet Shiraz 2001
Ville Heymeffiel Ricia 2014
Domaine Prematt Liomaine Vineyard Fares Vineyard Pinot Noir 2018
Dan-Am. Charbynon 2012
Irnois Orerin Cote
La

# Create larger fake wine name list

In [12]:
BATCH_SIZE = 5
PREDICT_LEN = 150
TOTAL_BATCHES = 1000

# We use a sequence length of 1, as we will be feeding in one character at a 
# time and predicting the next character.
prediction_model = lstm_model(seq_len=1, batch_size=BATCH_SIZE, stateful=True)
prediction_model.load_weights(MODEL_WEIGHTS_PATH)

fake_names = []
for ii in tqdm(range(TOTAL_BATCHES)):
    seed_txt = ''.join(random.choices(string.ascii_uppercase + string.digits, k=20))
    seed = transform(seed_txt)
    seed = np.repeat(np.expand_dims(seed, 0), BATCH_SIZE, axis=0)

    # First, run the seed forward to prime the state of the model.
    prediction_model.reset_states()
    for i in range(len(seed_txt) - 1):
        prediction_model.predict(seed[:, i:i + 1])

    # Now we can accumulate predictions!
    predictions = [seed[:, -1:]]
    for i in range(PREDICT_LEN):
        last_word = predictions[-1]
        next_probits = prediction_model.predict(last_word)[:, 0, :]

      # sample from our output distribution
        next_idx = [
            np.random.choice(256, p=next_probits[i])
            for i in range(BATCH_SIZE)
        ]
        predictions.append(np.asarray(next_idx, dtype=np.int32))

    for i in range(BATCH_SIZE):
        #print('PREDICTION %d\n\n' % i)
        p = [predictions[j][i] for j in range(PREDICT_LEN)]
        generated = ''.join([chr(c) for c in p])
        gen_list = generated.split('\n')[1:-1]
        for item in gen_list:
            fake_names.append(item)
        assert len(generated) == PREDICT_LEN, 'Generated text too short'

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [36:32<00:00,  2.19s/it]


In [13]:
fileName = "fake_names_{}_{}.pickle".format(len(fake_names),str(dt.date.today()))
pd.to_pickle(fake_names, 'data/fake/' + fileName)
print("Saved {}".format(fileName))

Saved fake_names_13301_2020-05-20.pickle


In [19]:
file_path = "data/fake/"
file_name = "fake_names_{}_{}.csv".format(len(fake_names),str(dt.date.today()))
print(f"Saving {len(fake_names):,} fake names to {file_path + file_name}")
pd.Series(fake_names).to_csv(f"{file_path + file_name}")

Saving 13,301 fake names to data/fake/fake_names_13301_2020-05-20.csv


# Compute fake wine name similarity

In [None]:
########## This code is super slow if matching large lists ##########

from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

#fake_names = test['name']
real_names = pd.read_pickle(SCRAPED_WINES_INPUT_PATH)['name']

fake_scores = {}
for f_name in tqdm(fake_names):
    max_score = 0.0
    for r_name in real_names:
        score = similar(f_name, r_name)
        if score > max_score:
            max_score = score
    fake_scores[f_name] = max_score
        
    
fake_scores = pd.Series(fake_scores)

########## This code is super slow if matching large lists ##########

### ^^ Above code takes ~8 hours to run, the files in the extra_code directory will split it out among 8 processes

# Check for wine names that match real ones

In [None]:
fake_scores = pd.Series(pd.read_pickle('new_scores.pickle')[0])

import matplotlib.pyplot as plt
plt.style.use('default')
print("Total fake names: ",len(fake_scores))
print("Total 90% or less match: ",len(fake_scores[fake_scores < 0.9]))

plt.figure(dpi=100)
plt.title('Disitribution of fake wine names to real ones')
plt.hist(fake_scores.values)
plt.show()