In [None]:
# Borrowed from: https://github.com/dennybritz/cnn-text-classification-tf

In [1]:
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import data_helpers
from text_cnn import TextCNN
from tensorflow.contrib import learn

  from ._conv import register_converters as _register_converters


# Data Preparation

In [3]:
# Data Preparation
# ==================================================

# Data loading params
dev_sample_percentage = .1
low_data_file = "./data/rt-polaritydata/win.low"
medium_data_file = "./data/rt-polaritydata/win.medium"
high_data_file = "./data/rt-polaritydata/win.high"


x_text, y = data_helpers.load_data_and_labels(low_data_file, 
                                              medium_data_file,
                                              high_data_file)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

del x, y, x_shuffled, y_shuffled

print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

Vocabulary Size: 7675
Train/Dev split: 10295/1143


In [4]:
# Spot checking word ids creation
print(x_text[0])
print(x_train[0])

give brady the jags o line and this game is so different zero time in the pocket this game so far
[   7   11   34    3    4 2422  286   41   75  483   20 1376  382 2884
 2029  594 1015  185    3  912   14    7  141   27 3958   23  892    7
  212   47  737   41 2404    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


# Naive Bayes Baseline

In [26]:
print(x_train.shape)
print(y_train.shape)
y_train_nb = y_train[:,2]*2+y_train[:,1]
y_dev_nb = y_dev[:,2]*2+y_dev[:,1]
print(y_train_nb.shape)

(10295, 142)
(10295, 3)
(10295,)


In [29]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
nb = MultinomialNB()
nb.fit(x_train, y_train_nb)
y_pred = nb.predict(x_dev)

acc = accuracy_score(y_dev_nb, y_pred)
print("Accuracy on test set: {:.02%}".format(acc))

Accuracy on test set: 37.27%


# Most Common Class Baseline Model

In [36]:
# Most common percent
from collections import Counter
list_cnts = Counter(y_train_nb)
highest = sorted(list_cnts, key = list_cnts.get, reverse = True)[:1]
highest_percent = list_cnts[highest[0]]/len(y_train_nb)
print(highest_percent)

# Accuracy for test data
list_cnts = Counter(y_dev_nb)
highest = sorted(list_cnts, key = list_cnts.get, reverse = True)[:1]
highest_percent = list_cnts[highest[0]]/len(y_dev)
print(highest_percent)

0.3682370082564352
0.36132983377077865


# CNN Model

In [30]:
# Parameters
# ==================================================

# Model Hyperparameters
embedding_dim = 128
filter_sizes = "3,4,5"
num_filters = 128
dropout_keep_prob = 0.5
l2_reg_lambda = 0.0

# Training parameters
batch_size = 64 
num_epochs = 5 #200
evaluate_every = 100
checkpoint_every = 100
num_checkpoints = 5

# Misc Parameters
allow_soft_placement = True
log_device_placement = False

In [31]:
# Training
# ==================================================

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=allow_soft_placement,
      log_device_placement=log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=x_train.shape[1],
            num_classes=y_train.shape[1],
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=embedding_dim,
            filter_sizes=list(map(int, filter_sizes.split(","))),
            num_filters=num_filters,
            l2_reg_lambda=l2_reg_lambda)

        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)

        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=num_checkpoints)

        # Write vocabulary
        vocab_processor.save(os.path.join(out_dir, "vocab"))

        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        def train_step(x_batch, y_batch):
            """
            A single training step
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: dropout_keep_prob
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluates model on a dev set
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)

        # Generate batches
        batches = data_helpers.batch_iter(
            list(zip(x_train, y_train)), batch_size, num_epochs)
        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % evaluate_every == 0:
                print("\nEvaluation:")
                dev_step(x_dev, y_dev, writer=dev_summary_writer)
                print("")
            if current_step % checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

INFO:tensorflow:Summary name embedding/W:0/grad/hist is illegal; using embedding/W_0/grad/hist instead.
INFO:tensorflow:Summary name embedding/W:0/grad/sparsity is illegal; using embedding/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/hist is illegal; using conv-maxpool-3/W_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/sparsity is illegal; using conv-maxpool-3/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/hist is illegal; using conv-maxpool-3/b_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/sparsity is illegal; using conv-maxpool-3/b_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-4/W:0/grad/hist is illegal; using conv-maxpool-4/W_0/grad/hist instead.
INFO:tens

2018-03-19T07:18:31.388576: step 92, loss 2.19109, acc 0.265625
2018-03-19T07:18:31.908727: step 93, loss 1.69365, acc 0.296875
2018-03-19T07:18:32.369183: step 94, loss 2.50262, acc 0.28125
2018-03-19T07:18:32.865938: step 95, loss 2.12053, acc 0.265625
2018-03-19T07:18:33.339869: step 96, loss 1.84941, acc 0.453125
2018-03-19T07:18:33.833016: step 97, loss 1.94382, acc 0.328125
2018-03-19T07:18:34.316673: step 98, loss 1.61944, acc 0.421875
2018-03-19T07:18:34.790082: step 99, loss 2.00684, acc 0.375
2018-03-19T07:18:35.240064: step 100, loss 2.02141, acc 0.40625

Evaluation:
2018-03-19T07:18:37.874239: step 100, loss 1.28262, acc 0.381452

Saved model checkpoint to /Users/kwheatley/Desktop/w266_nfl/cnn-text-classification-tf/runs/1521461861/checkpoints/model-100

2018-03-19T07:18:39.028634: step 101, loss 1.31917, acc 0.5
2018-03-19T07:18:39.531739: step 102, loss 1.73949, acc 0.359375
2018-03-19T07:18:40.091731: step 103, loss 1.47552, acc 0.359375
2018-03-19T07:18:40.550308: step 

2018-03-19T07:19:37.722539: step 214, loss 1.25005, acc 0.484375
2018-03-19T07:19:38.202815: step 215, loss 1.53457, acc 0.375
2018-03-19T07:19:38.666247: step 216, loss 1.43069, acc 0.328125
2018-03-19T07:19:39.119472: step 217, loss 1.37477, acc 0.484375
2018-03-19T07:19:39.560719: step 218, loss 1.475, acc 0.40625
2018-03-19T07:19:40.034343: step 219, loss 1.26043, acc 0.375
2018-03-19T07:19:40.491222: step 220, loss 1.38957, acc 0.421875
2018-03-19T07:19:40.972694: step 221, loss 1.37557, acc 0.421875
2018-03-19T07:19:41.410185: step 222, loss 1.50845, acc 0.390625
2018-03-19T07:19:41.859106: step 223, loss 1.39171, acc 0.40625
2018-03-19T07:19:42.321211: step 224, loss 1.52559, acc 0.328125
2018-03-19T07:19:42.763103: step 225, loss 1.30942, acc 0.390625
2018-03-19T07:19:43.209146: step 226, loss 1.28004, acc 0.53125
2018-03-19T07:19:43.650395: step 227, loss 1.46537, acc 0.46875
2018-03-19T07:19:44.135255: step 228, loss 1.74854, acc 0.3125
2018-03-19T07:19:44.610263: step 229, l

2018-03-19T07:20:47.258173: step 340, loss 1.06754, acc 0.515625
2018-03-19T07:20:47.847099: step 341, loss 1.20449, acc 0.421875
2018-03-19T07:20:48.433540: step 342, loss 0.968376, acc 0.546875
2018-03-19T07:20:49.003527: step 343, loss 1.03139, acc 0.53125
2018-03-19T07:20:49.591666: step 344, loss 1.13274, acc 0.484375
2018-03-19T07:20:50.231172: step 345, loss 1.20584, acc 0.40625
2018-03-19T07:20:50.791406: step 346, loss 1.20376, acc 0.484375
2018-03-19T07:20:51.557271: step 347, loss 0.950345, acc 0.5
2018-03-19T07:20:52.221001: step 348, loss 1.08113, acc 0.515625
2018-03-19T07:20:52.836069: step 349, loss 0.945372, acc 0.625
2018-03-19T07:20:53.640776: step 350, loss 1.13923, acc 0.484375
2018-03-19T07:20:54.235947: step 351, loss 1.04128, acc 0.53125
2018-03-19T07:20:55.129844: step 352, loss 1.19931, acc 0.4375
2018-03-19T07:20:55.941654: step 353, loss 1.03074, acc 0.46875
2018-03-19T07:20:56.655459: step 354, loss 0.986941, acc 0.546875
2018-03-19T07:20:57.246628: step 35

2018-03-19T07:21:53.889970: step 465, loss 0.9485, acc 0.546875
2018-03-19T07:21:54.405383: step 466, loss 1.11538, acc 0.4375
2018-03-19T07:21:55.034483: step 467, loss 1.04952, acc 0.53125
2018-03-19T07:21:55.502721: step 468, loss 0.999691, acc 0.46875
2018-03-19T07:21:55.978723: step 469, loss 1.08858, acc 0.359375
2018-03-19T07:21:56.448383: step 470, loss 1.10084, acc 0.40625
2018-03-19T07:21:56.951507: step 471, loss 1.14486, acc 0.390625
2018-03-19T07:21:57.466072: step 472, loss 1.03306, acc 0.453125
2018-03-19T07:21:57.961069: step 473, loss 1.16259, acc 0.390625
2018-03-19T07:21:58.537816: step 474, loss 1.08232, acc 0.421875
2018-03-19T07:21:59.190658: step 475, loss 1.14241, acc 0.3125
2018-03-19T07:21:59.721152: step 476, loss 0.979571, acc 0.625
2018-03-19T07:22:00.180725: step 477, loss 1.01086, acc 0.46875
2018-03-19T07:22:00.693360: step 478, loss 1.05637, acc 0.5
2018-03-19T07:22:01.202811: step 479, loss 1.18116, acc 0.4375
2018-03-19T07:22:01.674972: step 480, loss

2018-03-19T07:22:59.195357: step 590, loss 0.976144, acc 0.546875
2018-03-19T07:22:59.762249: step 591, loss 0.878112, acc 0.671875
2018-03-19T07:23:00.205624: step 592, loss 1.05281, acc 0.453125
2018-03-19T07:23:00.643395: step 593, loss 1.09133, acc 0.453125
2018-03-19T07:23:01.093178: step 594, loss 1.20056, acc 0.46875
2018-03-19T07:23:01.575012: step 595, loss 0.866691, acc 0.53125
2018-03-19T07:23:02.168305: step 596, loss 0.860682, acc 0.5625
2018-03-19T07:23:02.789722: step 597, loss 0.935743, acc 0.5625
2018-03-19T07:23:03.609932: step 598, loss 0.99508, acc 0.578125
2018-03-19T07:23:04.198009: step 599, loss 0.98832, acc 0.59375
2018-03-19T07:23:04.840210: step 600, loss 1.07034, acc 0.4375

Evaluation:
2018-03-19T07:23:07.682663: step 600, loss 1.07312, acc 0.419948

Saved model checkpoint to /Users/kwheatley/Desktop/w266_nfl/cnn-text-classification-tf/runs/1521461861/checkpoints/model-600

2018-03-19T07:23:08.788546: step 601, loss 1.03516, acc 0.46875
2018-03-19T07:23:09.

2018-03-19T07:24:11.672567: step 711, loss 0.851408, acc 0.609375
2018-03-19T07:24:12.305492: step 712, loss 0.97621, acc 0.515625
2018-03-19T07:24:12.802431: step 713, loss 0.899176, acc 0.53125
2018-03-19T07:24:13.262804: step 714, loss 0.907703, acc 0.578125
2018-03-19T07:24:13.721444: step 715, loss 0.880114, acc 0.625
2018-03-19T07:24:14.164752: step 716, loss 0.918363, acc 0.53125
2018-03-19T07:24:14.611972: step 717, loss 0.892787, acc 0.515625
2018-03-19T07:24:15.159019: step 718, loss 0.926338, acc 0.53125
2018-03-19T07:24:15.827879: step 719, loss 1.00134, acc 0.578125
2018-03-19T07:24:16.428266: step 720, loss 0.895541, acc 0.546875
2018-03-19T07:24:17.079491: step 721, loss 0.890483, acc 0.65625
2018-03-19T07:24:17.858338: step 722, loss 0.915272, acc 0.625
2018-03-19T07:24:18.477505: step 723, loss 0.988623, acc 0.515625
2018-03-19T07:24:19.014238: step 724, loss 1.0209, acc 0.515625
2018-03-19T07:24:19.487490: step 725, loss 0.987526, acc 0.53125
2018-03-19T07:24:19.94177