# Analyzing Data

We will need to clean and format our data for training. Computing summary statistics and visualizing the data first will help us account for missing values and augment existing features before training.

In [None]:
# `du` is imported here as well as in the later imports cell so that this
# analysis section also runs on a fresh kernel (Restart & Run All).
import data_utils as du

wine_utils = du.DataHelper()
wine_data = wine_utils.load_data()

In [None]:
# The first element of the loaded data is used as the training frame.
train_df = wine_data[0]
# Preview only the first rows instead of rendering the whole frame.
train_df.head()

In [None]:
# Report, for every column, the share of entries that are missing.
# The ratio is scaled by 100 so the printed value matches the "Percent" label.
for feature in train_df:
    total_entries = len(train_df[feature])
    num_nan = train_df[feature].isnull().sum()
    # Guard against an empty frame, which would otherwise divide by zero.
    percent_nan = 100.0 * float(num_nan) / total_entries if total_entries else 0.0
    print("Feature: {0} | Percent of NaN {1:.2f}%".format(feature, percent_nan))

# Load Data and Embeddings

In [1]:
from lstm import RNNModel, Config, pad_sequences
import embeddings as emb
import tensorflow as tf
import data_utils as du
import pandas as pd

# Build the embedding helper; the recorded output shows it loading GloVe vectors from disk.
# NOTE(review): `save_to_pickle=False` presumably skips caching the loaded embeddings — confirm in embeddings.py.
emb_helper = emb.embedding_helper(save_to_pickle = False)

947it [00:00, 9467.73it/s]

Loading Glove Embeddings:  /Users/LorenAC/AI/CS224N/FinalProjectRepo/Wine-NLP/embeddings/glove.42B.300d.txt


1917494it [03:20, 9586.14it/s]

Done. 1917495  words loaded!





In [2]:
# NOTE(review): the meaning of the argument 1000 is not visible here
# (presumably a cap on the number of examples) — confirm in data_utils.
data_helper = du.DataHelper(1000)

# Train/dev inputs, plus the label helper built for the "points" category.
X_train_df = data_helper.X_train
X_dev_df = data_helper.X_dev
label_helper_points = data_helper.labels_from_Y_cat("points")

# Get sub-embeddings for our given vocab

If we don't do this, the embedding matrix may be too large to give to the TensorFlow graph.

In [3]:
# Only the vocabulary is needed here; the second returned value is discarded.
vocab, _ = data_helper.generate_vocab_and_word_frequencies()

# Restrict the full embedding table to the words in `vocab`, keeping the
# lookups in both directions and the index passed later as `unk_indice`.
(
    sub_emb_matrix,
    sub_tok2ind,
    sub_ind2tok,
    sub_unk_ind,
) = emb_helper.get_sub_embeddings(vocab)

100%|██████████| 7925/7925 [00:00<00:00, 127435.86it/s]

Done. 7594  words loaded!





# Get our data ready for model preprocessing

This includes:

    1) encoding the words in our sentences as indices in the embedding matrix
    
    2) mapping labels into classes and getting into proper format
    
(Here we use our label_helper, which keeps track of the mapping between a label's value in the dataset and its corresponding class number for training. This helper also stores the number of classes for the label, which we will later pass to our model config.)

In [4]:
import numpy as np
import time

# `as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` returns the same NumPy array and works on every pandas version.
X_train_tokens = X_train_df.values
X_dev_tokens = X_dev_df.values

# Map tokens to indices via `sub_tok2ind`; `sub_unk_ind` is passed as the
# unknown-token index.
X_train_indices = emb_helper.tok2ind_ind2tok(X_train_tokens, lookup_dict=sub_tok2ind, unk_indice=sub_unk_ind)
X_dev_indices = emb_helper.tok2ind_ind2tok(X_dev_tokens, lookup_dict=sub_tok2ind, unk_indice=sub_unk_ind)

# Pair each split's encoded inputs with its class labels.
train_raw = [X_train_indices, label_helper_points.train_classes]
dev_raw = [X_dev_indices, label_helper_points.dev_classes]

# Build and Run Model

This step includes:

    1) initializing our Config, Model

    2) preprocessing data further using model parameters

    3) opening our tensorflow Graph and Session

In [5]:
from util import write_conll, print_sentence

# The number of classes comes from the label helper; the embedding width is
# read off the sub-embedding matrix itself.
config = Config("lstm", n_classes = label_helper_points.num_classes, many2one = True)
embeddings = np.asarray(sub_emb_matrix)
config.embed_size = embeddings.shape[1]

# NOTE(review): the saved output of this cell ends with
# `IndexError: list index out of range`; the traceback was not preserved, so
# the failing call cannot be identified from the notebook alone.
with tf.Graph().as_default():
    print("Building model...")
    start = time.time()
    model = RNNModel(data_helper, config, embeddings)
    # Apply the format string: passing the value as a second print argument
    # emitted the literal "%.2f" placeholder followed by the raw float.
    print("took %.2f seconds" % (time.time() - start))

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    with tf.Session() as session:
        session.run(init)
        model.fit(session, saver, train_raw, dev_raw)

        # Collect the dev-set outputs and split them into parallel sequences.
        output = model.output(session, dev_raw)
        sentences, class_labels, predictions = zip(*output)

        # Translate class ids back to the original label values.
        predictions = [[str(label_helper_points.class_2_lbl[cls]) for cls in preds] for preds in predictions]
        labels = [[str(label_helper_points.class_2_lbl[cls]) for cls in classes] for classes in class_labels]

        # Same helper as in preprocessing, now given the index -> token lookup.
        sentences = emb_helper.tok2ind_ind2tok(sentences, lookup_dict = sub_ind2tok, unk_indice = sub_unk_ind)
        output = zip(sentences, labels, predictions)

        #with open(model.config.conll_output, 'w') as f:
        #    write_conll(f, output)
        with open(model.config.eval_output, 'w') as f:
            for sentence, label, prediction in output:
                print_sentence(f, sentence, label, prediction)
                

Building model...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


took %.2f seconds 2.034195899963379
Epoch %d out of %d 1 30
Loss:  2.6435738

Accuracy | Train: %f , Dev: %f (0.128, 0.127)
New best accuracy! Saving model in %s results/lstm/20180315_013407/model.weights
Epoch %d out of %d 2 30
Loss:  2.5478263

Accuracy | Train: %f , Dev: %f (0.129, 0.127)
Epoch %d out of %d 3 30
Loss:  2.460935

Accuracy | Train: %f , Dev: %f (0.205, 0.185)
New best accuracy! Saving model in %s results/lstm/20180315_013407/model.weights
Epoch %d out of %d 4 30
Loss:  2.3776398

Accuracy | Train: %f , Dev: %f (0.178, 0.139)
Epoch %d out of %d 5 30
Loss:  2.296639

Accuracy | Train: %f , Dev: %f (0.262, 0.144)
Epoch %d out of %d 6 30
Loss:  2.158554

Accuracy | Train: %f , Dev: %f (0.326, 0.167)
Epoch %d out of %d 7 30
Loss:  2.0125046

Accuracy | Train: %f , Dev: %f (0.344, 0.152)
Epoch %d out of %d 8 30
Loss:  1.873337

Accuracy | Train: %f , Dev: %f (0.334, 0.15)
Epoch %d out of %d 9 30
Loss:  1.7302176

Accuracy | Train: %f , Dev: %f (0.412, 0.166)
Epoch %d out of

IndexError: list index out of range

In [None]:
# Inspect the [indices, classes] pair assembled for the dev split.
dev_raw

# Define General Model for Running in TensorFlow

In [None]:
def do_evaluate(args):
    """Restore a saved RNNModel and emit its predictions for ``args.data``.

    Each (sentence, labels, predictions) triple produced by the model is
    passed to ``print_sentence`` together with ``args.output``.

    NOTE(review): ``ModelHelper``, ``read_conll``, ``load_embeddings``,
    ``logger`` and ``LBLS`` are not defined or imported in any visible cell of
    this notebook, and ``Config`` is called here with a different argument
    shape than in the training cell — confirm where these come from before
    running this function.

    Args:
        args: Object providing ``model_path``, ``data`` and ``output``
            attributes; it is also handed to ``Config`` and
            ``load_embeddings``.
    """
    config = Config(args)
    helper = ModelHelper.load(args.model_path)
    input_data = read_conll(args.data)
    embeddings = load_embeddings(args, helper)
    config.embed_size = embeddings.shape[1]

    with tf.Graph().as_default():
        logger.info("Building model...")
        build_started = time.time()
        model = RNNModel(helper, config, embeddings)
        logger.info("took %.2f seconds", time.time() - build_started)

        init_op = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init_op)
            # Load the stored weights over the freshly initialised variables.
            saver.restore(session, model.config.model_output)
            for sentence, labels, predicted_ids in model.output(session, input_data):
                predicted_labels = [LBLS[idx] for idx in predicted_ids]
                print_sentence(args.output, sentence, labels, predicted_labels)

In [None]:
# Scratch experiment with NumPy broadcasting.
# NOTE(review): `x` has shape (3, 1) while `[y]*4` converts to shape (4, 1), so
# the addition below raises a broadcasting ValueError as written.
x = [[3], [5], [5]]
y = [5]
x = np.asarray(x)
x = x + [y]*4
#np.expand_dims(x, 2).shape
x

In [None]:
# Preview the training inputs. The bare name `X_train` is not assigned in any
# visible cell; the frame is bound to `X_train_df` when the data is loaded.
X_train_df.head()

In [13]:
# Turn a flat two-element list into a (2, 1) column.
x = [86, 1]
np.expand_dims(x, axis=1)

array([[86],
       [ 1]])

In [119]:
import numpy as np
# Toy arrays for experimenting with boolean masking.
# `ex` has shape (2, 3, 3).
ex = np.array([
    [[0, 1, 2], [1, 5, 6], [2, 3, 2]],
    [[3, 3, 4], [4, 4, 3], [5, 1, 2]],
])
# `labels` has shape (2, 1).
labels = np.array([[0], [3]])
# `mask` has shape (2, 3); the last position of each row is False.
mask = np.array([
    [True, True, False],
    [True, True, False],
])

In [120]:
# Confirm the shapes of the toy arrays before masking.
print(ex.shape, labels.shape)

(2, 3, 3) (2, 1)


In [121]:
# Boolean-index the first two axes of `ex` with the (2, 3) mask: the four True
# positions select four length-3 rows, giving shape (4, 3).
ex_masked = ex[mask]
# Left disabled: `labels` has shape (2, 1), which does not match the (2, 3)
# mask, so it cannot be indexed the same way.
#lab_masked = labels[mask]

In [124]:
# Show the original array next to the rows kept by the mask.
print(ex, ex_masked)

[[[0 1 2]
  [1 5 6]
  [2 3 2]]

 [[3 3 4]
  [4 4 3]
  [5 1 2]]] [[0 1 2]
 [1 5 6]
 [3 3 4]
 [4 4 3]]


In [125]:
# Masking flattens the first two axes of `ex`, while `labels` is unchanged.
print(ex_masked.shape, labels.shape)

(4, 3) (2, 1)


In [118]:
# NOTE(review): appears to rely on a `y` array left over from a cell that is
# not visible in the notebook; re-running this on its own boolean result
# inverts the mask.
y = y==0

In [67]:
# Display the boolean mask.
y

array([[ True,  True,  True,  True,  True,  True,  True, False, False,
        False],
       [ True,  True,  True,  True,  True,  True,  True, False, False,
        False],
       [ True,  True,  True,  True,  True,  True,  True, False, False,
        False]])

In [68]:
# Keep only the entries of `x` where the mask is True.
# NOTE(review): `x` is overwritten with its masked version, so this cell
# presumably cannot be re-run without first rebuilding the original `x`.
x = x[y]

In [69]:
# Check the shape left after masking.
x.shape

(21, 5)

In [70]:
# Regroup the masked rows into 3 groups of width 5; -1 lets NumPy infer the
# middle dimension (7 for the recorded (21, 5) input).
np.reshape(x, (3, -1, 5))

array([[[3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [0., 0., 0., 0., 0.]],

       [[3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [0., 0., 0., 0., 0.]],

       [[3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [0., 0., 0., 0., 0.]]])