In [16]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import numpy as np
import pandas as pd

In [35]:
# Module-level tokenizer shared by the functions below: dataset_preparation
# fits it on the corpus and generate_text reuses the fitted vocabulary.
tokenizer = Tokenizer()

def dataset_preparation(data):
    """Turn raw text into next-word training data.

    Fits the module-level ``tokenizer`` on the corpus, expands every line into
    its n-gram prefixes (from the first two tokens up to the whole line),
    left-pads the prefixes to a common length and splits each padded row into
    predictors (all but the last token) and a one-hot label (the last token).

    Args:
        data: Either a single string, which is lower-cased and split on
            newlines, or an iterable of texts that is used as the corpus
            without further cleanup.

    Returns:
        Tuple ``(predictors, label, max_sequence_len, total_words)``:
        ``predictors`` is the padded array without its last column, ``label``
        is the one-hot encoding of the last column, ``max_sequence_len`` is
        the length of the longest n-gram sequence and ``total_words`` is
        ``len(tokenizer.word_index) + 1``.

    Raises:
        ValueError: If no line yields at least two tokens, so that no
            training sequence can be built.
    """
    # basic cleanup
    if isinstance(data, str):
        corpus = data.lower().split("\n")
    else:
        # Materialise the iterable: the corpus is walked twice below (fit,
        # then sequence conversion), which would exhaust a generator.
        corpus = list(data)

    # tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    # create input sequences using list of tokens: every prefix of length >= 2
    input_sequences = []
    for token_list in tokenizer.texts_to_sequences(corpus):
        for end in range(2, len(token_list) + 1):
            input_sequences.append(token_list[:end])

    if not input_sequences:
        raise ValueError(
            "dataset_preparation: no line contains at least two tokens, "
            "cannot build training sequences"
        )

    # pad sequences (pad_sequences already returns a NumPy array)
    max_sequence_len = max(len(seq) for seq in input_sequences)
    input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    # create predictors and label
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=total_words)

    return predictors, label, max_sequence_len, total_words

def create_model(predictors, label, max_sequence_len, total_words, epochs=3):
    """Build, train and return the next-word prediction model.

    The network is an Embedding layer (10 dimensions) followed by two stacked
    LSTM layers (150 units returning sequences, then 100 units) and a softmax
    Dense layer over the vocabulary.

    Args:
        predictors: Training inputs; the Embedding layer is configured for
            rows of length ``max_sequence_len - 1``.
        label: One-hot training targets with ``total_words`` classes.
        max_sequence_len: Length of the padded n-gram sequences (predictors
            plus the label token).
        total_words: Vocabulary size used for the embedding input dimension
            and the output layer width.
        epochs: Maximum number of training epochs. Defaults to 3, the value
            that was previously hard-coded.

    Returns:
        The fitted ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(total_words, 10, input_length=max_sequence_len-1))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # fit() receives no validation data, so 'val_loss' is never logged and a
    # callback monitoring it could not trigger. Monitor the training loss
    # instead. With patience=5 this only has an effect when epochs > 5.
    earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto')
    model.fit(predictors, label, epochs=epochs, verbose=1, callbacks=[earlystop])

    # summary() prints the table itself and returns None, so it is not
    # wrapped in print().
    model.summary()
    return model

def generate_text(seed_text, next_words, max_sequence_len):
    """Extend ``seed_text`` by repeatedly appending the predicted next word.

    Relies on the module-level ``tokenizer`` (already fitted) and ``model``
    (already trained); both must exist before this function is called.

    Args:
        seed_text: Text to start from.
        next_words: Number of prediction steps to run.
        max_sequence_len: Sequence length used during training; the input is
            left-padded/truncated to ``max_sequence_len - 1`` tokens.

    Returns:
        ``seed_text`` with one ``" " + word`` appended per step. If the
        predicted index is not in the tokenizer vocabulary, an empty word is
        appended for that step.
    """
    # Reverse lookup built once instead of scanning word_index on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # Sequential.predict_classes no longer exists in current Keras;
        # arg-max over the softmax output gives the same class index.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        seed_text += " " + index_to_word.get(predicted, "")
    return seed_text



# Load the (names, descriptions) pair. Only the pickle is read here: a
# separate read of 'data.txt' would be overwritten by this assignment and
# would leave its file handle unclosed.
# NOTE(security): unpickling can execute arbitrary code - only load pickle
# files from a trusted source.
names, data = pd.read_pickle('data/descriptions.pickle')


predictors, label, max_sequence_len, total_words = dataset_preparation(data)
model = create_model(predictors, label, max_sequence_len, total_words)
print(generate_text("This wine is ", 50, max_sequence_len))

KeyboardInterrupt: 

In [29]:

# Load the (names, descriptions) pair. Only the pickle is read here: a
# separate read of 'data.txt' would be overwritten by this assignment and
# would leave its file handle unclosed.
# NOTE(security): unpickling can execute arbitrary code - only load pickle
# files from a trusted source.
names, data = pd.read_pickle('data/descriptions.pickle')

# The unpickled descriptions are used as the corpus directly, without a
# lower()/split("\n") step.
corpus = data

# tokenization
# NOTE(review): this fits the shared module-level tokenizer; if it was fitted
# before, the new texts are presumably added on top - confirm this is intended.
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

# create input sequences using list of tokens
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# pad sequences
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences2 = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# create predictors and label
# Slice the padded array (input_sequences2): input_sequences is still a
# ragged Python list and does not support [:, :-1] indexing.
predictors, label = input_sequences2[:, :-1], input_sequences2[:, -1]
label = ku.to_categorical(label, num_classes=total_words)

In [37]:
# Display the names that were unpickled together with the descriptions.
names

['Hall Napa Valley Cabernet Sauvignon 2013 ',
 'Rombauer Chardonnay 2017 ',
 'Antinori Tignanello 2015 ',
 'Borne of Fire Cabernet Sauvignon 2016 ',
 'Torbreck Woodcutters Shiraz 2017 ',
 'Bargetto Pommard Clone Pinot Noir 2015 ',
 'Peter Mathis Rose of Grenache 2017 ',
 'Castoro Cellars Merlot 2012 ',
 'Bertrand Galbrun Chatrois Cabernet Franc 2015 ',
 'Vina Mein Ribeiro 2016 ',
 'Bethel Heights Estate Grown Chardonnay 2013 ',
 'Dourthe La Grande Cuvee Sauvignon Blanc 2017 ',
 'Zuccardi Serie A Bonarda 2016 ',
 "Nugan Estate Frasca's Lane Vineyard Chardonnay 2016 ",
 'Stanton Vineyards Oakville Cabernet Sauvignon 2014 ',
 'Krug Grande Cuvee Brut (166th Edition) ',
 'Sebastiani North Coast Cabernet Sauvignon 2016 ',
 'Hecht &amp; Bannier Minervois 2013 ',
 'Vigilance Sauvignon Blanc 2018 ',
 'Crosby Cabernet Sauvignon 2016 ',
 'Dashe Dry Creek Zinfandel (375ML half-bottle) 2016 ',
 'Zuccardi Tito Zuccardi 2014 ',
 'Joseph Drouhin Volnay 2016 ',
 'Maipe Reserve Cabernet Sauvignon 2016 '