# Defining Dependencies

We will use TensorFlow to build our LSTM.

In [7]:
import tensorflow as tf
import data_utils as du
import pandas as pd

# Analyzing Data

We need to clean and format our data before training. Computing summary statistics and visualizing the data first will help us account for missing values and augment existing features for training.

In [11]:
# Instantiate the helper from the data_utils module (imported as `du`).
wine_utils = du.data_utils()
# Load the dataset through the helper.
# NOTE(review): the following cell indexes the result with [0], so load_data()
# presumably returns a sequence of data splits — confirm against data_utils.
wine_data = wine_utils.load_data()

In [12]:
# Use the first element of the loaded data as the training DataFrame.
# NOTE(review): assumes index 0 is the training split — confirm against
# data_utils.load_data().
train_df = wine_data[0]
# Bare last expression: renders the frame as this cell's output.
train_df

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
114630,114630,France,"[This, mildly, aromatic, blend, of, Grenache, ...",Les Galets Dorés,86,14.0,Rhône Valley,Costières de Nîmes,,Joe Czerwinski,@JoeCz,Chateau Mourgues du Gres 2010 Les Galets Dorés...,Rhône-style White Blend,Chateau Mourgues du Gres
39275,39275,US,"[Bright, and, alive, in, crisp, pear, and, fre...",Limited Release,91,35.0,California,Los Carneros,Napa-Sonoma,Virginie Boone,@vboone,Black Stallion 2012 Limited Release Chardonnay...,Chardonnay,Black Stallion
34144,34144,US,"[This, pure, Malbec, carries, spicy, ,, slight...",Pepper Bridge Vineyard,90,32.0,Washington,Walla Walla Valley (WA),Columbia Valley,Paul Gregutt,@paulgwine,Walla Walla Vintners 2009 Pepper Bridge Vineya...,Malbec,Walla Walla Vintners
14243,14243,US,"[A, toasty, streak, of, espresso, -, flavored,...",Corral Creek Vineyards,88,48.0,Oregon,Chehalem Mountains,Willamette Valley,Paul Gregutt,@paulgwine,Chehalem 2010 Corral Creek Vineyards Pinot Noi...,Pinot Noir,Chehalem
86564,86564,US,"[The, grapes, come, from, the, Applegate, vall...",,85,10.0,Oregon,Oregon,Oregon Other,Paul Gregutt,@paulgwine,Bridgeview 2000 Merlot (Oregon),Merlot,Bridgeview
9390,9390,Argentina,"[This, ultraripe, Malbec, -, led, blend, sugge...",Alto,93,85.0,Mendoza Province,Mendoza,,Michael Schachner,@wineschach,Alta Vista 2011 Alto Red (Mendoza),Red Blend,Alta Vista
120860,120860,Australia,"[Penfolds', Bin, 51, is, a, perennial, winner,...",Bin 51,91,40.0,South Australia,Eden Valley,,Joe Czerwinski,@JoeCz,Penfolds 2014 Bin 51 Riesling (Eden Valley),Riesling,Penfolds
2825,2825,US,"[A, blend, of, six, different, varieties, with...",,86,12.0,Washington,Columbia Valley (WA),Columbia Valley,Sean P. Sullivan,@wawinereport,Charles & Charles 2014 Rosé (Columbia Valley (...,Rosé,Charles & Charles
125480,125480,US,"[A, delightful, southern, Rhône, -, style, win...",Le P'tit Pape,90,22.0,California,San Benito County,Central Coast,,,Le P'tit Paysan 2011 Le P'tit Pape G-S-M (San ...,G-S-M,Le P'tit Paysan
100278,100278,US,"[The, nose, on, this, wine, by, Burgundy, -, r...",La Pristina,93,30.0,California,Edna Valley,Central Coast,Matt Kettmann,@mattkettmann,Baileyana 2014 La Pristina Chardonnay (Edna Va...,Chardonnay,Baileyana


In [13]:
# For every column of the training frame, report how much of it is missing.
# The printed value is a fraction in [0, 1] (missing count / total rows),
# even though the label reads "Percent".
for feature in train_df:
    column = train_df[feature]
    missing_ratio = float(column.isnull().sum()) / float(len(column))
    print("Feature: {0} | Percent of NaN {1}".format(feature, missing_ratio))

Feature: Unnamed: 0 | Percent of NaN 0.0
Feature: country | Percent of NaN 0.0004946196374987635
Feature: description | Percent of NaN 0.0
Feature: designation | Percent of NaN 0.28924257246177687
Feature: points | Percent of NaN 0.0
Feature: price | Percent of NaN 0.06885105353982787
Feature: province | Percent of NaN 0.0004946196374987635
Feature: region_1 | Percent of NaN 0.162004418602095
Feature: region_2 | Percent of NaN 0.610371624220974
Feature: taster_name | Percent of NaN 0.20136515019949658
Feature: taster_twitter_handle | Percent of NaN 0.23946185383440136
Feature: title | Percent of NaN 0.0
Feature: variety | Percent of NaN 1.0991547499972522e-05
Feature: winery | Percent of NaN 0.0


# Define General Model for Running in TensorFlow

In [None]:
def do_train(args):
    """Train an RNNModel and write its dev-set predictions to disk.

    Builds a Config from ``args``, loads the data and embeddings, fits the
    model inside a fresh TensorFlow graph and session, then writes the
    dev-set predictions to ``model.config.conll_output`` (via ``write_conll``)
    and to ``model.config.eval_output`` (via ``print_sentence``).

    NOTE(review): Config, load_and_preprocess_data, load_embeddings, RNNModel,
    LBLS, write_conll, print_sentence, logger and time are not defined or
    imported in the visible part of this notebook; they must be in scope
    before this function is called.

    Args:
        args: Run options, passed through unchanged to Config,
            load_and_preprocess_data and load_embeddings.
    """
    # Set up some parameters.
    config = Config(args)
    helper, train, dev, train_raw, dev_raw = load_and_preprocess_data(args)
    embeddings = load_embeddings(args, helper)
    # The embedding width is read from the second dimension of the loaded matrix.
    config.embed_size = embeddings.shape[1]
    helper.save(config.output_path)

    # Reporting is disabled here, so the text-file branch below always runs.
    report = None  # Report(Config.eval_output)

    with tf.Graph().as_default():
        logger.info("Building model...")
        start = time.time()
        model = RNNModel(helper, config, embeddings)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            model.fit(session, saver, train, dev)
            if report:
                report.log_output(model.output(session, dev_raw))
                report.save()
            else:
                # Save predictions in a text file.
                # Build a concrete list of (sentence, labels, label names)
                # triples. The results are consumed twice below; a lazy
                # zip() iterator would be exhausted by write_conll and leave
                # the eval_output file empty on Python 3. A list also copes
                # with an empty model output.
                output = [
                    (sentence, gold, [LBLS[l] for l in preds])
                    for sentence, gold, preds in model.output(session, dev_raw)
                ]

                with open(model.config.conll_output, 'w') as f:
                    write_conll(f, output)
                with open(model.config.eval_output, 'w') as f:
                    for sentence, gold, predicted in output:
                        print_sentence(f, sentence, gold, predicted)

def do_evaluate(args):
    """Restore a saved model and print its predictions for ``args.data``.

    Loads the model helper from ``args.model_path``, reads the input with
    ``read_conll``, rebuilds the RNNModel in a fresh TensorFlow graph,
    restores its weights from ``model.config.model_output`` and passes every
    (sentence, labels, predicted label names) triple to ``print_sentence``
    together with ``args.output``.

    Args:
        args: Run options; ``model_path``, ``data`` and ``output`` are read
            here, and the object is also passed to Config and load_embeddings.
    """
    cfg = Config(args)
    model_helper = ModelHelper.load(args.model_path)
    examples = read_conll(args.data)
    pretrained = load_embeddings(args, model_helper)
    # The embedding width is read from the second dimension of the loaded matrix.
    cfg.embed_size = pretrained.shape[1]

    with tf.Graph().as_default():
        logger.info("Building model...")
        build_started = time.time()
        model = RNNModel(model_helper, cfg, pretrained)
        elapsed = time.time() - build_started
        logger.info("took %.2f seconds", elapsed)

        init_op = tf.global_variables_initializer()
        checkpoint_saver = tf.train.Saver()

        with tf.Session() as sess:
            sess.run(init_op)
            # Replace the freshly initialised variables with the saved ones.
            checkpoint_saver.restore(sess, model.config.model_output)
            for tokens, gold, predicted in model.output(sess, examples):
                label_names = [LBLS[idx] for idx in predicted]
                print_sentence(args.output, tokens, gold, label_names)